My ML Project

Authors
Affiliation

Name I, First Name I

Name of the University

Name II, First Name II

Published

May 13, 2024

Abstract

The following machine learning project focuses on…

Warning: package 'ggplot2' was built under R version 4.3.1
Warning: package 'dplyr' was built under R version 4.3.1
Warning: package 'patchwork' was built under R version 4.3.1
Warning: package 'maps' was built under R version 4.3.1
Warning: package 'scales' was built under R version 4.3.1
Warning: package 'ggmap' was built under R version 4.3.1
Warning: package 'knitr' was built under R version 4.3.1
Warning: package 'rmarkdown' was built under R version 4.3.1
Warning: package 'lattice' was built under R version 4.3.1
Warning: package 'plotly' was built under R version 4.3.1
Warning: package 'leaflet' was built under R version 4.3.1

1 Introduction

  • Overview and Motivation
  • Related Work
  • Research questions

1.1 Setup

Code
print('hello')
#> [1] "hello"

1.1.1 Libraries

Code
# Python code
import numpy as np
print(np.mean([10, 20, 30, 40, 50]))
#> 30.0

2 Data

  • Sources
  • Description
  • Wrangling/cleaning
  • Spotting mistakes and missing data (could be part of EDA too)
  • Listing anomalies and outliers (could be part of EDA too)

2.1 Wrangling and Cleaning

  • ajouter source
  • ajouter description
  • expliquer blabla
  • Explain why we remove NA from m2 column.
  • Explain …

2.1.1 Raw dataset

Code
properties <- read.csv(file.path(here(),"data/properties.csv"))
# show 1000 first rows of properties using reactable
reactable(head(properties, 1000))
Code

# Create a tibble with cantons and observations
observations_table <- tibble(
  Canton = c("Vaud", "Bern", "Lucerne", "Zurich", "Uri", "Schwyz",
             "Obwalden", "Nidwalden", "Glarus", "St. Gallen", "Grisons", 
             "Aargau", "Thurgau", "Ticino", "Valais", "Neuchatel", 
             "Geneva", "Jura", "Zug", "Fribourg", "Solothurn", 
             "Basel-Stadt", "Basel-Landschaft", "Schaffhausen", 
             "Appenzell-Ausser-Rhoden", "Appenzell-Inner-Rhoden", "Total"),
  Observations = c(3232, 1553, 376, 1191, 71, 93, 29, 51, 55, 757, 405,
                   1481, 553, 4230, 3601, 513, 629, 329, 69, 1242, 590, 
                   149, 705, 118, 102, 12, sum(c(3232, 1553, 376, 1191, 71, 93, 29, 51, 55, 757, 405,
                                               1481, 553, 4230, 3601, 513, 629, 329, 69, 1242, 590, 
                                               149, 705, 118, 102, 12)))
)

# Display the table using kable and kableExtra
observations_table %>%
  kbl(caption = "Number of Observations by Canton") %>%
  kable_styling(position = "center", bootstrap_options = c("striped", "bordered", "hover")) %>%
  add_header_above(c(" " = 1, "Observations" = 1)) # Adds headers spanning columns
Number of Observations by Canton
Observations
Canton Observations
Vaud 3232
Bern 1553
Lucerne 376
Zurich 1191
Uri 71
Schwyz 93
Obwalden 29
Nidwalden 51
Glarus 55
St. Gallen 757
Grisons 405
Aargau 1481
Thurgau 553
Ticino 4230
Valais 3601
Neuchatel 513
Geneva 629
Jura 329
Zug 69
Fribourg 1242
Solothurn 590
Basel-Stadt 149
Basel-Landschaft 705
Schaffhausen 118
Appenzell-Ausser-Rhoden 102
Appenzell-Inner-Rhoden 12
Total 22136

2.1.2 Cleaning

Code
# Identify values causing the issue
problematic_values <- properties$number_of_rooms[is.na(as.numeric(properties$number_of_rooms))]
#> Warning: NAs introduced by coercion
# Replace non-numeric values with NA
properties$number_of_rooms <- as.numeric(gsub("[^0-9.]", "", properties$number_of_rooms))

# Remove non-numeric characters and convert to numeric
properties$price <- as.numeric(gsub("[^0-9]", "", properties$price))

# Subset the dataset to exclude rows with price < 20000
properties <- properties[properties$price >= 20000, ]

# Subset the dataset to exclude rows with numbers of rooms < 25
properties <- properties[properties$number_of_rooms <25, ]

# Replace incomplete addresses
properties$address <- gsub("^\\W*[.,0-]\\W*", "", properties$address)

properties_filtered <- na.omit(properties)

properties_filtered$year_category <- substr(properties_filtered$year_category, 1, 9)
# Assuming 'year_category' is a column in the 'properties' dataset
properties_filtered$year_category <- as.factor(properties_filtered$year_category)

# Preprocess the number_of_rooms column
properties_filtered$number_of_rooms <- as.character(properties_filtered$number_of_rooms)
properties_filtered$number_of_rooms <- gsub("\\D", "", properties_filtered$number_of_rooms)  # Remove non-numeric characters
properties_filtered$number_of_rooms <- as.numeric(properties_filtered$number_of_rooms)       # Convert to numeric
properties_filtered$number_of_rooms <- trunc(properties_filtered$number_of_rooms)             # Truncate non-integer values

# remove m^2 from column 'square_meters'
properties_filtered$square_meters <- as.numeric(gsub("\\D", "", properties_filtered$square_meters))
# print how many NA observations left in square_meters
print(sum(is.na(properties_filtered$square_meters)))
#> [1] 988
# remove NA
properties_filtered <- properties_filtered[!is.na(properties_filtered$square_meters),]
# add majuscule to canton
properties_filtered$canton <- tools::toTitleCase(properties_filtered$canton)

# show 100 first row of cleaned dataset using reactable
reactable(head(properties_filtered, 100))
Code

#filter properties_filtered to contains only 'price', 'number_of_rooms', 'square_meters'
properties_summary <- properties_filtered[, c('price', 'number_of_rooms', 'square_meters')]
#summary statistics
summary(properties_summary)
#>      price          number_of_rooms square_meters 
#>  Min.   :   25000   Min.   :  1.0   Min.   :   1  
#>  1st Qu.:  690000   1st Qu.: 35.0   1st Qu.:  99  
#>  Median :  992340   Median : 45.0   Median : 136  
#>  Mean   : 1349710   Mean   : 41.1   Mean   : 159  
#>  3rd Qu.: 1550000   3rd Qu.: 55.0   3rd Qu.: 190  
#>  Max.   :26149500   Max.   :185.0   Max.   :2000
# Data
data <- data.frame(
  price = c(25000, 690000, 992340, 1348429, 1550000, 26149500),
  number_of_rooms = c(1.0, 35.0, 45.0, 41.1, 55.0, 185.0),
  square_meters = c(1, 98, 136, 159, 190, 2000)
)

# Summary statistics
summary_stats <- summary(data)

# Summary statistics
summary_stats <- cbind(
  Min = apply(data, 2, min),
  Q1 = apply(data, 2, quantile, probs = 0.25),
  Median = apply(data, 2, median),
  Mean = apply(data, 2, mean),
  Q3 = apply(data, 2, quantile, probs = 0.75),
  Max = apply(data, 2, max)
)

# Create table
table <- kbl(summary_stats, align = rep('c', 6), caption = "Summary statistics for the dataset") %>%
  kable_styling(position = "center", bootstrap_options = c("striped", "hover", "condensed", "responsive"))
table
Summary statistics for the dataset
Min Q1 Median Mean Q3 Max
price 25000 765585.0 1170384 5.13e+06 1.50e+06 26149500
number_of_rooms 1 36.5 43 6.04e+01 5.25e+01 185
square_meters 1 107.5 148 4.31e+02 1.82e+02 2000

2.1.3 AMTOVZ_CSV_LV95 Data

The dataset described is the “Official Index of Localities” (Répertoire officiel des localités) provided by the Swiss Federal Office of Topography (swisstopo). It contains comprehensive information about all localities in Switzerland and the Principality of Liechtenstein, including their names, postal codes, and perimeters.

This dataset is regularly updated on a monthly basis, incorporating changes reported by cantonal authorities and Swiss Post. It serves various purposes, including spatial analysis, integration with other geographic datasets, usage as a geolocated background in GIS (Geographic Information Systems) and CAD (Computer-Aided Design) systems, statistical analysis, and as a reference dataset for information systems.

Updates and release notes for the dataset are provided periodically, detailing changes and improvements made over time. The Swiss Federal Office of Topography manages and distributes this dataset as part of its responsibilities in collecting and providing official geospatial data for Switzerland.

Source - swisstopo

2.1.3.1 Creating Variable zip_code and merging with AMTOVZ_CSV_LV95

Code
df <- properties_filtered
#the address column is like : '1844 Villeneuve VD' and has zip code number in it
#taking out the zip code number and creating a new column 'zip_code'
#the way to identify the zip code is to identify numbers that are 4 digits long
df$zip_code <- as.numeric(gsub("\\D", "", df$address))
#removing the first two number of zip code has more than 4 number
df$zip_code <- ifelse(df$zip_code > 9999, df$zip_code %% 10000, df$zip_code)

2.1.3.2 Using AMTOVZ_CSV_LV95 to get the city and canton from the zip code

Code
#read .csv AMTOVZ_CSV_LV95
amto <- read.csv(file.path(here(),"data/AMTOVZ_CSV_WGS84.csv"), sep = ";")
#creating a new dataframe with 'Ortschaftsname' as 'City'Place_name', 'PLZ' as 'zip_code', 'KantonskÃ.rzel' as 'Canton_code', 'E' as 'lon' and 'N' as 'lat'
amto_df <- amto[, c('Gemeindename', 'PLZ', 'Kantonskürzel', 'E', 'N')]
#renaming the columns
colnames(amto_df) <- c('Community', 'zip_code', 'Canton_code', 'lon', 'lat')

#remove duplicates of zip code
amto_df <- amto_df[!duplicated(amto_df$zip_code),]

#add the variable of amto_df to the df if the zip code matches
df <- merge(df, amto_df, by = "zip_code", all.x = TRUE)

#check if there are nan in city
df[is.na(df$Community),]
#>       zip_code    price number_of_rooms square_meters
#> 1           25  2200000              65           165
#> 2           25  2200000              10           263
#> 3           26   655000              35            66
#> 4           26  1995000              75           180
#> 5          322   870000              25            59
#> 6          322   880000              25            55
#> 7          322   975000              35            56
#> 230       1014  1510000              55           146
#> 1137      1200 16092000               7           400
#> 1138      1200  3285450               5           230
#> 1139      1200   679000              55           142
#> 5478      1919   785000              35           103
#> 5479      1919  1908000              65           210
#> 5480      1919  2558620              55           270
#> 5481      1919  1065000              45           130
#> 7621      2500  1100000               5           154
#> 7622      2500   420000              45           115
#> 7623      2500   885500              55           130
#> 7624      2500   872500              45           144
#> 7625      2500   872500              45           138
#> 7626      2500   887500              55           130
#> 7627      2500   870500              45           125
#> 7628      2500   892500              45           144
#> 7629      2500   885500              55           130
#> 7630      2500  1050000              45           121
#> 7631      2500   877500              45           138
#> 7632      2500   887500              45           144
#> 7633      2500  1450000              55           198
#> 8325      3000   820000              55           165
#> 8326      3000  1140000              35           115
#> 8327      3000  1090000              35           115
#> 8328      3000  1090000              55           193
#> 8329      3000   920000              45           157
#> 8330      3000  1090000              55           193
#> 8331      3000  1590000              55           330
#> 8332      3000   720000              35           102
#> 8333      3000   920000              45           157
#> 10434     4000   180000               3            70
#> 10435     4000   975000              45           125
#> 10436     4000  2100000              65           360
#> 12358     5201   725000              35            95
#> 13212     6000   695000              45           133
#> 13964     6511   440000               2            64
#> 14240     6547 15000000              75           220
#> 14558     6602  2800000              75           242
#> 14559     6602  2800000              65           250
#> 14560     6602   270000              15            28
#> 14561     6602   450000              35            75
#> 14562     6604  1990000              45           220
#> 14563     6604  2668590              55           290
#> 14564     6604   760000              35            78
#> 16575     6901  3660930              45           290
#> 16576     6901  3660930              45           290
#> 16577     6903   790000              35           105
#> 16578     6907   995000              45           114
#> 16579     6907   995000              45           114
#> 16580     6911   469350              55           140
#> 16581     6911   610000              35           103
#> 16582     6911   660000              75           200
#> 16583     6911   737550              45            82
#> 17892     7133  2266290              55           160
#> 17901     7135  2690000              85           236
#> 18161     8000  2100000              45           152
#> 18162     8000  1650000              45           142
#> 18163     8000   925000              35           102
#> 18164     8000  1650000              45           142
#> 18165     8000  1150000              45           128
#> 18166     8000  1450000              55           143
#> 18167     8000  1990000              55           200
#> 18168     8000  1990000              55           200
#> 18169     8000   975000              45           122
#> 18170     8000  2495000              55           482
#> 18650     8238   245000               2            49
#> 19074     8423  2110000              65           204
#> 19075     8423  2190000              55           167
#> 20288     9241   545000              45           100
#> 20289     9241   730840              55           130
#>                                                  address
#> 1                                       1000 Lausanne 25
#> 2                                       1000 Lausanne 25
#> 3                                       1000 Lausanne 26
#> 4                          Lausanne 26, 1000 Lausanne 26
#> 5                    Via Cuolm Liung 30d, 7032 Laax GR 2
#> 6                                         7032 Laax GR 2
#> 7                       Via Murschetg 29, 7032 Laax GR 2
#> 230                                        1014 Lausanne
#> 1137                                         1200 Genève
#> 1138                                         1200 Genève
#> 1139  Chemin des pralets, 74100 Etrembières, 1200 Genève
#> 5478                                       1919 Martigny
#> 5479                                       1919 Martigny
#> 5480                                       1919 Martigny
#> 5481                                       1919 Martigny
#> 7621                                    2500 Biel/Bienne
#> 7622                                    2500 Biel/Bienne
#> 7623                                    2500 Biel/Bienne
#> 7624                                    2500 Biel/Bienne
#> 7625                                    2500 Biel/Bienne
#> 7626                                    2500 Biel/Bienne
#> 7627                                    2500 Biel/Bienne
#> 7628                                    2500 Biel/Bienne
#> 7629                                    2500 Biel/Bienne
#> 7630                     Hohlenweg 11b, 2500 Biel/Bienne
#> 7631                                    2500 Biel/Bienne
#> 7632                                    2500 Biel/Bienne
#> 7633                                         2500 Bienne
#> 8325                                           3000 Bern
#> 8326                                           3000 Bern
#> 8327                                           3000 Bern
#> 8328                                           3000 Bern
#> 8329                                           3000 Bern
#> 8330                                           3000 Bern
#> 8331                                           3000 Bern
#> 8332                                           3000 Bern
#> 8333                                           3000 Bern
#> 10434           Lörrach Brombach Steinsack 6, 4000 Basel
#> 10435                                         4000 Basel
#> 10436                                         4000 Basel
#> 12358                                      5201 Brugg AG
#> 13212   in TRIENGEN, ca. 20 min. bei Luzern, 6000 Luzern
#> 13964                                     6511 Cadenazzo
#> 14240                               Augio 1F, 6547 Augio
#> 14558                                       6602 Muralto
#> 14559                                       6602 Muralto
#> 14560                                       6602 Muralto
#> 14561                      Via Bacilieri 2, 6602 Muralto
#> 14562                                       6604 Solduno
#> 14563                                       6604 Solduno
#> 14564                                       6604 Locarno
#> 16575                                        6901 Lugano
#> 16576                                        6901 Lugano
#> 16577                                        6903 Lugano
#> 16578                                      6907 MASSAGNO
#> 16579                                      6907 MASSAGNO
#> 16580                             6911 Campione d'Italia
#> 16581                             6911 Campione d'Italia
#> 16582                             6911 Campione d'Italia
#> 16583                             6911 Campione d'Italia
#> 17892                  Inder Platenga 34, 7133 Obersaxen
#> 17901                                       7135 Fideris
#> 18161                                        8000 Zürich
#> 18162                                        8000 Zürich
#> 18163                                        8000 Zürich
#> 18164                                        8000 Zürich
#> 18165                                        8000 Zürich
#> 18166                                        8000 Zürich
#> 18167                                        8000 Zürich
#> 18168                                        8000 Zürich
#> 18169                                        8000 Zürich
#> 18170                                        8000 Zürich
#> 18650      Stemmerstrasse 14, 8238 Büsingen am Hochrhein
#> 19074                      Chüngstrasse 60, 8423 Embrach
#> 19075                      Chüngstrasse 48, 8423 Embrach
#> 20288                                       9241 Kradolf
#> 20289                                       9241 Kradolf
#>             canton    property_type floor year_category Community
#> 1             Vaud            Villa           2006-2010      <NA>
#> 2             Vaud     Single house           1919-1945      <NA>
#> 3             Vaud        Apartment noteg     2016-2024      <NA>
#> 4             Vaud            Villa           1961-1970      <NA>
#> 5          Grisons        Apartment    eg     2016-2024      <NA>
#> 6          Grisons        Apartment noteg     2016-2024      <NA>
#> 7          Grisons        Apartment noteg     2011-2015      <NA>
#> 230           Vaud        Apartment    eg     2011-2015      <NA>
#> 1137        Geneva     Single house           2011-2015      <NA>
#> 1138        Geneva Bifamiliar house           1981-1990      <NA>
#> 1139        Geneva Bifamiliar house           2016-2024      <NA>
#> 5478        Valais        Apartment noteg     2016-2024      <NA>
#> 5479        Valais        Apartment noteg     2016-2024      <NA>
#> 5480        Valais       Attic flat noteg     2016-2024      <NA>
#> 5481        Valais        Apartment noteg     2016-2024      <NA>
#> 7621          Bern     Single house           2001-2005      <NA>
#> 7622          Bern        Apartment noteg     1971-1980      <NA>
#> 7623          Bern            Villa           2016-2024      <NA>
#> 7624          Bern            Villa           2016-2024      <NA>
#> 7625          Bern     Single house           2016-2024      <NA>
#> 7626          Bern     Single house           2016-2024      <NA>
#> 7627          Bern     Single house           2016-2024      <NA>
#> 7628          Bern     Single house           2016-2024      <NA>
#> 7629          Bern     Single house           2016-2024      <NA>
#> 7630          Bern     Single house           2001-2005      <NA>
#> 7631          Bern     Single house           2016-2024      <NA>
#> 7632          Bern     Single house           2016-2024      <NA>
#> 7633          Bern     Single house           2016-2024      <NA>
#> 8325          Bern        Apartment noteg     2016-2024      <NA>
#> 8326          Bern        Apartment    eg     2016-2024      <NA>
#> 8327          Bern        Apartment    eg     2016-2024      <NA>
#> 8328          Bern        Roof flat noteg     2016-2024      <NA>
#> 8329          Bern        Apartment noteg     2016-2024      <NA>
#> 8330          Bern        Apartment noteg     2016-2024      <NA>
#> 8331          Bern        Apartment noteg     1991-2000      <NA>
#> 8332          Bern        Apartment    eg     2016-2024      <NA>
#> 8333          Bern           Duplex noteg     2016-2024      <NA>
#> 10434  Basel-Stadt     Single house           1961-1970      <NA>
#> 10435  Basel-Stadt     Single house           2016-2024      <NA>
#> 10436  Basel-Stadt            Villa           2016-2024      <NA>
#> 12358       Aargau        Apartment noteg     2016-2024      <NA>
#> 13212      Lucerne        Apartment noteg     1991-2000      <NA>
#> 13964       Ticino        Apartment noteg     2016-2024      <NA>
#> 14240      Grisons     Single house           2016-2024      <NA>
#> 14558       Ticino     Single house           1981-1990      <NA>
#> 14559       Ticino     Single house           1981-1990      <NA>
#> 14560       Ticino        Apartment    eg     1961-1970      <NA>
#> 14561       Ticino        Apartment noteg     1946-1960      <NA>
#> 14562       Ticino       Attic flat noteg     2011-2015      <NA>
#> 14563       Ticino        Apartment noteg     2011-2015      <NA>
#> 14564       Ticino        Apartment noteg     2011-2015      <NA>
#> 16575       Ticino       Attic flat noteg     2011-2015      <NA>
#> 16576       Ticino        Apartment noteg     2011-2015      <NA>
#> 16577       Ticino        Apartment noteg     2006-2010      <NA>
#> 16578       Ticino        Apartment noteg     2016-2024      <NA>
#> 16579       Ticino        Apartment noteg     2016-2024      <NA>
#> 16580       Ticino        Apartment noteg     1946-1960      <NA>
#> 16581       Ticino        Apartment    eg     1946-1960      <NA>
#> 16582       Ticino     Single house           1971-1980      <NA>
#> 16583       Ticino        Apartment noteg     1991-2000      <NA>
#> 17892      Grisons     Single house           2006-2010      <NA>
#> 17901      Grisons     Single house              0-1919      <NA>
#> 18161       Zurich        Apartment noteg     2016-2024      <NA>
#> 18162       Zurich       Attic flat noteg     2016-2024      <NA>
#> 18163       Zurich        Apartment noteg     2016-2024      <NA>
#> 18164       Zurich        Apartment noteg     2016-2024      <NA>
#> 18165       Zurich        Apartment noteg     2016-2024      <NA>
#> 18166       Zurich        Apartment    eg     2016-2024      <NA>
#> 18167       Zurich        Apartment noteg     2006-2010      <NA>
#> 18168       Zurich       Attic flat noteg     2006-2010      <NA>
#> 18169       Zurich     Single house           2016-2024      <NA>
#> 18170       Zurich        Apartment noteg        0-1919      <NA>
#> 18650 Schaffhausen        Apartment noteg     1961-1970      <NA>
#> 19074       Zurich Bifamiliar house           2016-2024      <NA>
#> 19075       Zurich     Single house           2016-2024      <NA>
#> 20288      Thurgau        Apartment noteg     1991-2000      <NA>
#> 20289      Thurgau        Apartment noteg     1991-2000      <NA>
#>       Canton_code lon lat
#> 1            <NA>  NA  NA
#> 2            <NA>  NA  NA
#> 3            <NA>  NA  NA
#> 4            <NA>  NA  NA
#> 5            <NA>  NA  NA
#> 6            <NA>  NA  NA
#> 7            <NA>  NA  NA
#> 230          <NA>  NA  NA
#> 1137         <NA>  NA  NA
#> 1138         <NA>  NA  NA
#> 1139         <NA>  NA  NA
#> 5478         <NA>  NA  NA
#> 5479         <NA>  NA  NA
#> 5480         <NA>  NA  NA
#> 5481         <NA>  NA  NA
#> 7621         <NA>  NA  NA
#> 7622         <NA>  NA  NA
#> 7623         <NA>  NA  NA
#> 7624         <NA>  NA  NA
#> 7625         <NA>  NA  NA
#> 7626         <NA>  NA  NA
#> 7627         <NA>  NA  NA
#> 7628         <NA>  NA  NA
#> 7629         <NA>  NA  NA
#> 7630         <NA>  NA  NA
#> 7631         <NA>  NA  NA
#> 7632         <NA>  NA  NA
#> 7633         <NA>  NA  NA
#> 8325         <NA>  NA  NA
#> 8326         <NA>  NA  NA
#> 8327         <NA>  NA  NA
#> 8328         <NA>  NA  NA
#> 8329         <NA>  NA  NA
#> 8330         <NA>  NA  NA
#> 8331         <NA>  NA  NA
#> 8332         <NA>  NA  NA
#> 8333         <NA>  NA  NA
#> 10434        <NA>  NA  NA
#> 10435        <NA>  NA  NA
#> 10436        <NA>  NA  NA
#> 12358        <NA>  NA  NA
#> 13212        <NA>  NA  NA
#> 13964        <NA>  NA  NA
#> 14240        <NA>  NA  NA
#> 14558        <NA>  NA  NA
#> 14559        <NA>  NA  NA
#> 14560        <NA>  NA  NA
#> 14561        <NA>  NA  NA
#> 14562        <NA>  NA  NA
#> 14563        <NA>  NA  NA
#> 14564        <NA>  NA  NA
#> 16575        <NA>  NA  NA
#> 16576        <NA>  NA  NA
#> 16577        <NA>  NA  NA
#> 16578        <NA>  NA  NA
#> 16579        <NA>  NA  NA
#> 16580        <NA>  NA  NA
#> 16581        <NA>  NA  NA
#> 16582        <NA>  NA  NA
#> 16583        <NA>  NA  NA
#> 17892        <NA>  NA  NA
#> 17901        <NA>  NA  NA
#> 18161        <NA>  NA  NA
#> 18162        <NA>  NA  NA
#> 18163        <NA>  NA  NA
#> 18164        <NA>  NA  NA
#> 18165        <NA>  NA  NA
#> 18166        <NA>  NA  NA
#> 18167        <NA>  NA  NA
#> 18168        <NA>  NA  NA
#> 18169        <NA>  NA  NA
#> 18170        <NA>  NA  NA
#> 18650        <NA>  NA  NA
#> 19074        <NA>  NA  NA
#> 19075        <NA>  NA  NA
#> 20288        <NA>  NA  NA
#> 20289        <NA>  NA  NA

We have 77 NAN, where

  • The zip code was not found in the atmo df
  • The zip code was incorectly isolated from the address

Removed them ::: {.cell layout-align=“center”}

Code
#remove the rows with nan in city
properties_filtered <- df[!is.na(df$Community),]
reactable(head(properties_filtered, 100))

:::

2.1.4 Tax data

  • source https://swisstaxcalculator.estv.admin.ch/#/taxdata/tax-rates
  • ajouter description
  • expliquer blabla

2.1.4.1 Cleaning

Code
# read csv
impots <- read.csv(file.path(here(),"data/estv_income_rates.csv"), sep = ",", header = TRUE, stringsAsFactors = FALSE)

# Remove 1st row
impots <- impots[-1, ]
# Remove 3rd column
impots <- impots[, -3]

# Combine text for columns 4-8
impots[1, 4:8] <- "Impôt sur le revenu"
# Combine text for columns 9-13
impots[1, 9:13] <- "Impôt sur la fortune"
# Combine text for columns 14-16
impots[1, 14:16] <- "Impôt sur le bénéfice"
# Combine text for columns 17-19
impots[1, 17:19] <- "Impôt sur le capital"

# Combine content of the first 2 rows into the 2nd row
impots[2, ] <- apply(impots[1:2, ], 2, function(x) paste(ifelse(is.na(x[1]), x[2], ifelse(is.na(x[2]), x[1], paste(x[1], x[2], sep = " ")))))

# Remove 1st row
impots <- impots[-1, ]

# Assign the text to the 1st row and 1st column
impots[1, 1] <- "Coefficient d'impôt en %"
# Replace column names with the content of the first row
colnames(impots) <- impots[1, ]
impots <- impots[-1, ]

# Check for missing values in impots
any_missing <- any(is.na(impots))

if (any_missing) {
  print("There are missing values in impots.")
} else {
  print("There are no missing values in impots.")
}
#> [1] "There are no missing values in impots."


# Replace row names with the content of the 3rd column
row.names(impots) <- impots[, 3]
impots <- impots[, -3]

# Remove 2nd column (to avoid canton column)
impots <- impots[, -2]
# Remove impot eglise
impots <- impots[, -c(4:6)]
impots <- impots[, -c(6:8)]
impots <- impots[, -8]
impots <- impots[, -10]
# Clean data and convert to numeric
cleaned_impots <- apply(impots, 2, function(x) as.numeric(gsub("[^0-9.-]", "", x)))

# Replace NA values with 0
cleaned_impots[is.na(cleaned_impots)] <- 0

# Check for non-numeric values
non_numeric <- sum(!is.na(cleaned_impots) & !is.numeric(cleaned_impots))
if (non_numeric > 0) {
  print(paste("Warning: Found", non_numeric, "non-numeric values."))
}

rownames(cleaned_impots) <- rownames(impots)

#reactable(head(cleaned_impots, 100))

2.1.5 Commune Data

2.1.5.1 Cleaning

  • ajouter source
  • ajouter description
  • expliquer blabla

Replaces NAs in both Taux de couverture social and Political (Conseil National Datas) For Taux de couverture Social: NAs were due to reason “Q” = “Not indicated to protect confidentiality” We replaced the NAs by the average taux de couverture in Switzerland in 2019, which was 3.2%

For Political data: NAs were due to reason “M” = “Not indicated because data was not important or applicable” Therefore, we replaced the NAs by 0

Code
# il faudra changer le path
commune_prep <- read.csv(file.path(here(),"data/commune_data.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)

# We keep only 2019 to have some reference? (2020 is apparently not really complete)
commune_2019 <- subset(commune_prep, PERIOD_REF == "2019") %>%
  select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE", "STATUS"))

# delete les lignes ou Status = Q ou M (pas de valeur) et ensuite on enlève la colonne
commune_2019 <- subset(commune_2019, STATUS == "A") %>%
  select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE"))

# on enlève les lignes qui sont des aggrégats
commune_2019 <- subset(commune_2019, REGION != "Schweiz")

commune_2019 <- commune_2019 %>%
  pivot_wider(names_from = INDICATORS, values_from = VALUE)

# Rename columns using the provided map
commune <- commune_2019 %>%
  rename(`Population - Habitants` = Ind_01_01,
         `Population - Densité de la population` = Ind_01_03,
         `Population - Etrangers` = Ind_01_08,
         `Population - Part du groupe d'âge 0-19 ans` = Ind_01_04,
         `Population - Part du groupe d'âge 20-64 ans` = Ind_01_05,
         `Population - Part du groupe d'âge 65+ ans` = Ind_01_06,
         `Population - Taux brut de nuptialité` = Ind_01_09,
         `Population - Taux brut de divortialité` = Ind_01_10,
         `Population - Taux brut de natalité` = Ind_01_11,
         `Population - Taux brut de mortalité` = Ind_01_12,
         `Population - Ménages privés` = Ind_01_13,
         `Population - Taille moyenne des ménages` = Ind_01_14,
         `Sécurité sociale - Taux d'aide sociale` = Ind_11_01,
         `Conseil national - PLR` = Ind_14_01,
         `Conseil national - PDC` = Ind_14_02,
         `Conseil national - PS` = Ind_14_03,
         `Conseil national - UDC` = Ind_14_04,
         `Conseil national - PEV/PCS` = Ind_14_05,
         `Conseil national - PVL` = Ind_14_06,
         `Conseil national - PBD` = Ind_14_07,
         `Conseil national - PST/Sol.` = Ind_14_08,
         `Conseil national - PES` = Ind_14_09,
         `Conseil national - Petits partis de droite` = Ind_14_10)

# If no one voted for a party, set as NA -> replacing it with 0 instead
commune <- commune %>%
  mutate_at(vars(starts_with("Conseil national")), ~replace_na(., 0))


# Removing NAs from Taux de couverture sociale column
# Setting the mean as the mean for Switzerland in 2019 (3.2%)
mean_taux_aide_social <- 3.2

# Replace NA values with the mean
commune <- commune %>%
  mutate(`Sécurité sociale - Taux d'aide sociale` = if_else(is.na(`Sécurité sociale - Taux d'aide sociale`), mean_taux_aide_social, `Sécurité sociale - Taux d'aide sociale`))
#show 100 first rows of commune using reactable
reactable(head(commune, 100))
Code

# commune_prep <- read.csv(file.path(here(),"data/commune_data.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)
# 
# # We keep only 2019 to have some reference? (2020 is apparently not really complete)
# commune_2019 <- subset(commune_prep, PERIOD_REF == "2019") %>%
#   select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE", "STATUS"))
# 
# # delete les lignes ou Status = Q ou M (pas de valeur) et ensuite on enlève la colonne
# commune_2019 <- subset(commune_2019, STATUS == "A") %>%
#   select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE"))
# 
# # on enlève les lignes qui sont des aggrégats
# commune_2019 <- subset(commune_2019, REGION != "Schweiz")
# 
# commune_2019 <- commune_2019 %>%
#   pivot_wider(names_from = INDICATORS, values_from = VALUE)
# 
# # Rename columns using the provided map
# commune <- commune_2019 %>%
#   rename(`Population - Habitants` = Ind_01_01,
#          `Population - Densité de la population` = Ind_01_03,
#          `Population - Etrangers` = Ind_01_08,
#          `Population - Part du groupe d'âge 0-19 ans` = Ind_01_04,
#          `Population - Part du groupe d'âge 20-64 ans` = Ind_01_05,
#          `Population - Part du groupe d'âge 65+ ans` = Ind_01_06,
#          `Population - Taux brut de nuptialité` = Ind_01_09,
#          `Population - Taux brut de divortialité` = Ind_01_10,
#          `Population - Taux brut de natalité` = Ind_01_11,
#          `Population - Taux brut de mortalité` = Ind_01_12,
#          `Population - Ménages privés` = Ind_01_13,
#          `Population - Taille moyenne des ménages` = Ind_01_14,
#          `Sécurité sociale - Taux d'aide sociale` = Ind_11_01,
#          `Conseil national - PLR` = Ind_14_01,
#          `Conseil national - PDC` = Ind_14_02,
#          `Conseil national - PS` = Ind_14_03,
#          `Conseil national - UDC` = Ind_14_04,
#          `Conseil national - PEV/PCS` = Ind_14_05,
#          `Conseil national - PVL` = Ind_14_06,
#          `Conseil national - PBD` = Ind_14_07,
#          `Conseil national - PST/Sol.` = Ind_14_08,
#          `Conseil national - PES` = Ind_14_09,
#          `Conseil national - Petits partis de droite` = Ind_14_10)
# 
# # If no one voted for a party, set as NA -> replacing it with 0 instead
# commune <- commune %>%
#   mutate_at(vars(starts_with("Conseil national")), ~replace_na(., 0))
# 
# 
# # Removing NAs from Taux de couverture sociale column
# # Setting the mean as the mean for Switzerland in 2019 (3.2%)
# mean_taux_aide_social <- 3.2
# 
# # Replace NA values with the mean
# commune <- commune %>%
#   mutate(`Sécurité sociale - Taux d'aide sociale` = if_else(is.na(`Sécurité sociale - Taux d'aide sociale`), mean_taux_aide_social, `Sécurité sociale - Taux d'aide sociale`))
# 

3 EDA

3.1 Map representation of distribution of properties

Code
# Create a leaflet map with optimized markers
map <- leaflet(properties_filtered) %>%
  addTiles() %>%  # Add default OpenStreetMap tiles
  addProviderTiles(providers$Esri.NatGeoWorldMap) %>% # Add topographic maps for context
  addCircleMarkers(
    ~lon, ~lat,
    radius = 1.5,  # Smaller radius for the circle markers
    color = "#32012F",  # Specifying a color for the markers
    fillOpacity = 0.2,  # Semi-transparent fill
    stroke = FALSE,  # No border to the circle markers to reduce visual noise
    popup = ~paste("Price: ", price, "<br>",
                   "Rooms: ", number_of_rooms, "<br>",
                   "Type: ", property_type, "<br>",
                   "Year: ", year_category),
    label = ~paste("Price: ", price)  # Tooltip on hover
  ) %>% addLegend(
    position = "bottomright",  # Position the legend at the bottom right
    colors = "#32012F",  # Use the same color as the markers
    labels = "Properties"  # Label for the legend
  )

map$width <- "100%"  # Set the width of the map to 100%
map$height <- 600  # Set the height of the map to 600 pixels

map

3.2 Histogram of prices

Code
histogram_price <- ggplot(properties_filtered, aes(x = price)) +
  geom_histogram(binwidth = 100000, fill = "skyblue", color = "red") +
  labs(title = "Distribution of Prices",
       x = "Price",
       y = "Frequency") +
  theme_minimal()
# Convert ggplot object to plotly object
interactive_histogram_price <- ggplotly(histogram_price, width = 600, height = 400 )
# Display the interactive histogram
interactive_histogram_price

3.3 Histogram of prices for each property type

note : only price between 0 and 500000 so some outliers aren’t here

Code
# Create the ggplot object
histogram <- ggplot(properties_filtered, aes(x = price)) +
  geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
  facet_wrap(~ property_type, scales = "free", ncol = 2) +
  labs(title = "Distribution of Prices by Property Type",
       x = "Price",
       y = "Frequency") +
  theme_minimal() +
  xlim(0, 5000000)

# Convert ggplot object to plotly object
interactive_histogram <- ggplotly(histogram, width = 600, height = 1000)

# Display the interactive plot
interactive_histogram

3.4 Histogram of prices for each year category

note : only price between 0 and 500000 so some outliers aren’t here

Code
# Create a histogram of prices for each year category
histogram <- ggplot(properties_filtered, aes(x = price)) +
  geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
  facet_wrap(~ year_category, scales = "free", ncol = 2) +
  labs(title = "Distribution of Prices by Year Category",
       x = "Price",
       y = "Frequency") +
  theme_minimal() +
  xlim(0, 5000000)
# Convert ggplot object to plotly object
interactive_histogram_year <- ggplotly(histogram, width = 600, height = 1000)
# Display the interactive plot
interactive_histogram_year

3.5 Histogram of prices for each canton

note : only price between 0 and 500000 so some outliers aren’t here

Code
histogram <- ggplot(properties_filtered, aes(x = price)) +
  geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
  facet_wrap(~ canton, scales = "free", ncol = 2) +
  labs(title = "Distribution of Prices by Canton",
       x = "Price",
       y = "Frequency") +
  theme_minimal() +
  xlim(0, 5000000)

# Convert ggplot object to plotly object with adjusted height
interactive_histogram <- ggplotly(histogram, width = 600, height = 1000) %>%
  layout(height = 1000)  # Adjust the height as needed

# Display the interactive plot
interactive_histogram

3.6 Histogram of prices for each number of rooms

note : only price between 0 and 500000 so some outliers aren’t here

and the graph below only show apartments with less than 10 rooms (but you can change the code if needed

Code
properties_room <- properties_filtered[properties_filtered$number_of_rooms < 20, ]                  # Filter only number_of_rooms less than 20

# Create a histogram of prices for each number of rooms
histogram <- ggplot(properties_room, aes(x = price)) +
  geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
  facet_wrap(~ number_of_rooms, scales = "free", ncol = 2) +
  labs(title = "Distribution of Prices by Number of Rooms",
       x = "Price",
       y = "Frequency") +
  theme_minimal() +
  xlim(0, 5000000)

# Convert ggplot object to plotly object with adjusted height
interactive_histogram <- ggplotly(histogram, width = 600, height = 1000)

# Display the interactive plot
interactive_histogram

3.7 Histogram of prices with impot

Code
# colnames(properties_filtered)[(ncol(properties_filtered) - 3):ncol(properties_filtered)] <- gsub("\\s+", "_", colnames(properties_filtered)[(ncol(properties_filtered) - 3):ncol(properties_filtered)])
# 
# # Create a scatter plot to visualize correlation between price and Impôt cantonal
# scatter_plot <- ggplot(properties_filtered, aes(x = price, y = Impôt_cantonal_impots)) +
#   geom_point() +
#   labs(title = "Correlation between Price and Impôt cantonal",
#        x = "Price",
#        y = "Impôt cantonal") +
#   theme_minimal()
# 
# # Convert ggplot object to plotly object
# interactive_plot <- ggplotly(scatter_plot)
# 
# # Display the interactive plot
# interactive_plot
Code
impot_cols <- names(properties_filtered)[startsWith(names(properties_filtered), "Impôt")]

# Count the number of NA values in selected columns
na_counts <- colSums(is.na(properties_filtered[impot_cols]))

# Print the counts
print(na_counts)
#> numeric(0)

4 Unsupervised learning

  • Clustering and/or dimension reduction

Trying to Cluster commune datas to: 1. Reduce dimension 2. See similarities

A regarder, est-ce qu’on fait un cluster pour les datas politques + un cluster pour les data démographiques, ou est-ce qu’on regroupe tout?

Code
set.seed(123)

# Clustering demographic
cols_commune_demographic <- select(commune, -c("REGION", "CODE_REGION","Conseil national - PLR","Conseil national - PDC", "Conseil national - PS", "Conseil national - UDC", "Conseil national - PEV/PCS", "Conseil national - PVL", "Conseil national - PBD", "Conseil national - PST/Sol.", "Conseil national - PES", "Conseil national - Petits partis de droite"))

# Scale the columns, some are total numbers, some are percentages
cols_commune_demographic <- scale(cols_commune_demographic)

# Calculate the distance matrix
dist_matrix_demographic <- dist(cols_commune_demographic, method = "minkowski")

# Perform hierarchical clustering
hclust_model_demographic <- hclust(dist_matrix_demographic, method = "ward.D")

# Create dendrogram
dend_demo <- as.dendrogram(hclust_model_demographic)
dend_demo <- color_branches(dend_demo, k = 5) #Set number of cluster to 5, to keep the same scale for all our variables

plot(dend_demo, main = "Demographics - Hierarchical Clustering Dendrogram")

Code
# Clustering politics
set.seed(123)

cols_commune_politics <- select(commune, c("Conseil national - PLR","Conseil national - PDC", "Conseil national - PS", "Conseil national - UDC", "Conseil national - PEV/PCS", "Conseil national - PVL", "Conseil national - PBD", "Conseil national - PST/Sol.", "Conseil national - PES", "Conseil national - Petits partis de droite"))

# Scale the columns, some are total numbers, some are percentages
cols_commune_politics <- scale(cols_commune_politics)

# Calculate the distance matrix
dist_matrix_politics <- dist(cols_commune_politics, method = "minkowski")

# Perform hierarchical clustering
hclust_model_politics <- hclust(dist_matrix_politics, method = "ward.D")

# Create dendrogram
dend_pol <- as.dendrogram(hclust_model_politics)
dend_pol <- color_branches(dend_pol, k = 5) #Set number of cluster to 5, to keep the same scale for all our variables

plot(dend_pol, main = "Politics - Hierarchical Clustering Dendrogram")

4.1 Tax

Code
set.seed(123)

# Clean data and convert to numeric
cleaned_impots <- apply(impots, 2, function(x) as.numeric(gsub("[^0-9.-]", "", x)))
cleaned_impots[is.na(cleaned_impots)] <- 0  # Replace NA values with 0

# Scale the features
scaled_impots <- scale(cleaned_impots)

# Perform k-means clustering
k <- 2  # Initial guess for the number of clusters
kmeans_model <- kmeans(scaled_impots, centers = k)

# Check within-cluster sum of squares (elbow method)
wss <- numeric(10)
for (i in 1:10) {
  kmeans_model <- kmeans(scaled_impots, centers = i)
  wss[i] <- sum(kmeans_model$withinss)
}
# plot(1:10, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")

# Adjust k based on elbow method
k <- 5  

# Perform k-means clustering again with optimal k
kmeans_model <- kmeans(scaled_impots, centers = k)

# Assign cluster labels to dendrogram
clusters <- kmeans_model$cluster

# Plot dendrogram
#colored_dend <- color_branches(dend, k = 5)
#y_zoom_range <- c(0, 80)  # Adjust the y-axis range as needed

#plot(colored_dend, main = "Hierarchical Clustering Dendrogram", horiz = FALSE, ylim = y_zoom_range)
Code
# Get the cluster centers
cluster_centers <- kmeans_model$centers

# Create a data frame with cluster centers
cluster_centers_df <- data.frame(cluster = 1:k, cluster_centers)

# Print cluster centers
print(cluster_centers_df)
#>   cluster Coefficient.d.impôt.en.. Impôt.sur.le.revenu.Canton
#> 1       1                    0.400                     -0.611
#> 2       2                    0.786                     -0.390
#> 3       3                   -0.318                     -0.266
#> 4       4                   -0.792                     -0.699
#> 5       5                   -0.941                      1.932
#>   Impôt.sur.le.revenu.Commune Impôt.sur.la.fortune.Canton
#> 1                     -0.0839                      -0.615
#> 2                     -0.5732                      -0.394
#> 3                      0.9387                      -0.270
#> 4                     -0.3119                      -0.690
#> 5                      1.3332                       1.933
#>   Impôt.sur.la.fortune.Commune Impôt.sur.le.bénéfice.Canton
#> 1                      -0.0849                        1.951
#> 2                      -0.5725                       -0.249
#> 3                       0.9383                       -0.869
#> 4                      -0.3128                       -0.709
#> 5                       1.3330                        1.402
#>   Impôt.sur.le.bénéfice.Commune Impôt.sur.le.capital.Canton
#> 1                       -1.3419                       1.879
#> 2                       -0.6444                      -0.276
#> 3                        1.9896                      -0.874
#> 4                        0.0565                      -0.718
#> 5                        0.8732                       1.492
#>   Impôt.sur.le.capital.Commune
#> 1                     -1.30508
#> 2                     -0.65169
#> 3                      1.81595
#> 4                      0.00514
#> 5                      1.01512

# Calculate the size of each cluster
cluster_sizes <- table(kmeans_model$cluster)

# Print cluster sizes
print(cluster_sizes)
#> 
#>   1   2   3   4   5 
#>  75 999 178 462 417

# Get the cluster labels
cluster_labels <- kmeans_model$cluster

# Convert cleaned_impots to a data frame
impots_cluster <- as.data.frame(cleaned_impots)

# Add the cluster labels to cleaned_impots
impots_cluster$cluster <- cluster_labels

rownames(impots_cluster) <- rownames(impots)

impots_cluster <- impots_cluster %>%
  rownames_to_column(var = "Community")
Code
# Preparing df_commune for merging with main dataset

df_commune <- select(commune, REGION)

df_commune$Demographic_cluster <- cutree(hclust_model_demographic, k = 5)
df_commune$Political_cluster <- cutree(hclust_model_politics, k = 5)

# Preparing to merge

merging <- inner_join(amto_df, df_commune, by = c("Community" = "REGION"))

impots_cluster_subset <- impots_cluster[, c("Community", "cluster")]
merging <- merging %>%
  left_join(impots_cluster_subset, by = "Community")

clusters_df <- merging %>%
  rename(Tax_cluster = cluster) %>%
  rename(Commune = Community)

clusters_df <- clusters_df %>%
  select(c("Commune", "zip_code", "Canton_code", "Demographic_cluster", "Political_cluster", "Tax_cluster"))

# Only NAs are for commune Brugg, (written Brugg (AG) in the other data set) -> j'entre le cluster à la mano
clusters_df$Tax_cluster[is.na(clusters_df$Tax_cluster)] <- 2

# adding it to our main data set:
properties_filtered <- merge(properties_filtered, clusters_df[, c("zip_code", "Demographic_cluster", "Political_cluster", "Tax_cluster")], by = "zip_code", all.x = TRUE)

na_count <- sum(is.na(properties_filtered[, c("Demographic_cluster", "Political_cluster", "Tax_cluster")]))

# Print the result
if (na_count > 0) {
  print("There are NA values in the merged dataframe.")
  print(na_count)
} else {
  print("There are no NA values in the merged dataframe.")
}
#> [1] "There are NA values in the merged dataframe."
#> [1] 678

# Find rows with NA values in the specified columns
na_rows <- subset(properties_filtered, is.na(Demographic_cluster) | is.na(Political_cluster) | is.na(Tax_cluster))

5 Conclusion

  • Brief summary of the project
  • Take home message
  • Limitations
  • Future work?